This code covers chapter 8 of “Introduction to Data Mining” by Pang-Ning Tan, Michael Steinbach and Vipin Kumar. See table of contents for code examples for other chapters.

CC This work is licensed under the Creative Commons Attribution 4.0 International License. For questions please contact Michael Hahsler.

The ruspini data set is in package cluster. It is a very simple data set with well-separated clusters.

# Load the Ruspini data set shipped with package cluster
data(ruspini, package="cluster")

Shuffle rows

# Shuffle the rows so that any ordering present in the original data
# cannot influence order-sensitive algorithms.
# sample(n) for a positive scalar n draws a permutation of 1:n and is
# the safe idiom: the original sample(1:nrow(x)) would misbehave
# (1:0 == c(1, 0)) if the data set were ever empty.
ruspini <- ruspini[sample(nrow(ruspini)), ]
plot(ruspini)

Scale each column in the data to zero mean and unit standard deviation (z-scores). This prevents one attribute with a large range from dominating the others in the distance calculation.

# z-scores: each column now has mean 0 and standard deviation 1
ruspini_scaled <- scale(ruspini)
plot(ruspini_scaled)

Clustering methods

k-means Clustering

Assumes Euclidean distances. We use k=4 clusters and run the algorithm 10 times with randomly initialized centroids. The best result is returned.

# k-means with k = 4; nstart = 10 restarts with random initial
# centroids and keeps the solution with the lowest total
# within-cluster sum of squares
km <- kmeans(ruspini_scaled, centers=4, nstart=10)
km
## K-means clustering with 4 clusters of sizes 23, 20, 17, 15
## 
## Cluster means:
##            x          y
## 1 -0.3595425  1.1091151
## 2 -1.1385941 -0.5559591
## 3  1.4194387  0.4692907
## 4  0.4607268 -1.4912271
## 
## Clustering vector:
## 57 19 33 62 52 28 37 13 74 39 61 43 55 35 70 49 17  3 47 30 36 12  2 25 27 
##  3  2  1  4  3  1  1  2  4  1  4  1  3  1  4  3  2  2  3  1  1  2  2  1  1 
## 16 65 24 26 42  8 64 66 11 22 72 44 68 51 63 15 50 18  4 14 67 32 59 48 45 
##  2  4  1  1  1  2  4  4  2  1  4  3  4  3  4  2  3  2  2  2  4  1  3  3  3 
## 75 56 53 23 31  9  6 41 46 54 69 34 58 40 71 10 38 73 20  5 29  7  1 21 60 
##  4  3  3  1  1  2  2  1  3  3  4  1  3  1  4  2  1  4  2  2  1  2  2  1  3 
## 
## Within cluster sum of squares by cluster:
## [1] 2.658679 2.705477 3.641276 1.082373
##  (between_SS / total_SS =  93.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
# scatter plot with points colored by cluster assignment
plot(ruspini_scaled, col=km$cluster)
points(km$centers, pch=3, cex=2) # this adds the centroids
text(km$centers, labels=1:4, pos=2) # this adds the cluster ID

Alternative plot from package cluster (uses principal components analysis for >2 dimensions)

library(cluster)
## 
## Attaching package: 'cluster'
## The following object is masked _by_ '.GlobalEnv':
## 
##     ruspini
# clusplot projects onto the first two principal components for
# higher-dimensional data (here the data is already 2-dimensional)
clusplot(ruspini_scaled, km$cluster)

Inspect the centroids (cluster profiles)

km$centers
##            x          y
## 1 -0.3595425  1.1091151
## 2 -1.1385941 -0.5559591
## 3  1.4194387  0.4692907
## 4  0.4607268 -1.4912271
def.par <- par(no.readonly = TRUE) # save default, for resetting...
layout(t(1:4)) # 4 plots in one
# one bar chart of the attribute means (z-scores) per cluster
for(i in 1:4) barplot(km$centers[i,], ylim=c(-2,2), main=paste("Cluster", i))

par(def.par)  #- reset to default

Find data for a single cluster

All you need is to select the rows corresponding to the cluster. The next example plots all data points of cluster 1

# logical indexing: keep only the rows assigned to cluster 1
cluster1 <- ruspini_scaled[km$cluster==1,]
head(cluster1)
##             x         y
## 33 -0.3566917 1.0466240
## 28 -0.5533967 1.0466240
## 37 -0.1599867 1.0260913
## 39 -0.0944184 1.2314190
## 43  0.2662074 0.9644929
## 35 -0.2583392 1.1698207
plot(cluster1, xlim = c(-2,2), ylim = c(-2,2))

Try 10 clusters

plot(ruspini_scaled, col=kmeans(ruspini_scaled, centers=10)$cluster)

Hierarchical Clustering

dist defaults to method=“Euclidean”

# Euclidean distance matrix between all pairs of scaled points
d <- dist(ruspini_scaled)

We cluster using complete link

hc <- hclust(d, method="complete")

Dendrogram

plot(hc)
# draw boxes around the branches that form the 4 clusters
rect.hclust(hc, k=4)

plot(as.dendrogram(hc), leaflab="none") # plot dendrogram without leaf labels

More plotting options for dendrograms, including plotting parts of large dendrograms can be found here.

# cut the dendrogram to obtain a flat clustering with 4 clusters
cluster_complete <- cutree(hc, k=4)
plot(ruspini_scaled, col=cluster_complete)

Try 10 clusters

plot(ruspini_scaled, col=cutree(hc, k=10))

Clustering with single link

hc_single <- hclust(d, method="single")
plot(hc_single)
rect.hclust(hc_single, k=4)

cluster_single <- cutree(hc_single, k=4)
plot(ruspini_scaled, col=cluster_single)

Try 10 clusters

plot(ruspini_scaled, col=cutree(hc_single, k=10))

Density-based clustering with DBSCAN

library(dbscan)

Parameters: minPts is often chosen as dimensionality of the data +1. Decide on epsilon using the knee in the kNN distance plot (seems to be around eps = .25).

# sorted distance of each point to its 3rd nearest neighbor;
# the knee of this curve suggests a good eps
kNNdistplot(ruspini_scaled, k = 3)
abline(h=.25, col="red")

run dbscan

db <- dbscan(ruspini_scaled, eps=.25, minPts=3)
db
## DBSCAN clustering for 75 objects.
## Parameters: eps = 0.25, minPts = 3
## The clustering contains 5 cluster(s) and 6 noise points.
## 
##  0  1  2  3  4  5 
##  6 12 19 20 15  3 
## 
## Available fields: cluster, eps, minPts
str(db)
## List of 3
##  $ cluster: int [1:75] 1 2 3 4 1 3 3 2 4 3 ...
##  $ eps    : num 0.25
##  $ minPts : num 3
##  - attr(*, "class")= chr [1:2] "dbscan_fast" "dbscan"
# +1 so noise points (cluster id 0) get a valid plotting color
plot(ruspini_scaled, col=db$cluster+1L)

Note: 0 is not a color so we add 1 to cluster (0 is black now).

Alternative visualization from package dbscan

hullplot(ruspini_scaled, db)

Play with eps (neighborhood size) and minPts (minimum number of points needed for a core point).

Gaussian Mixture Models

# mclust fits Gaussian finite mixture models via the EM algorithm
library(mclust)
## Package 'mclust' version 5.3
## Type 'citation("mclust")' for citing this R package in publications.

Mclust uses Bayesian Information Criterion (BIC) to find the number of clusters (model selection). BIC uses the likelihood and a penalty term to guard against overfitting.

# covariance structure and number of components selected by BIC
m <- Mclust(ruspini_scaled)
summary(m)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm 
## ----------------------------------------------------
## 
## Mclust EEI (diagonal, equal volume and shape) model with 5 components:
## 
##  log.likelihood  n df       BIC       ICL
##       -91.26485 75 16 -251.6095 -251.7486
## 
## Clustering table:
##  1  2  3  4  5 
## 14 20 23 15  3
plot(m, what = "classification")

Rerun with a fixed number of 4 clusters

# G = 4 fixes the number of mixture components
m <- Mclust(ruspini_scaled, G=4)
summary(m)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm 
## ----------------------------------------------------
## 
## Mclust EEI (diagonal, equal volume and shape) model with 4 components:
## 
##  log.likelihood  n df       BIC       ICL
##       -101.6027 75 13 -259.3327 -259.3356
## 
## Clustering table:
##  1  2  3  4 
## 17 20 23 15
plot(m, what = "classification")

Spectral clustering

library("kernlab")
# spectral clustering of the scaled data into 4 clusters
cluster_spec <- specc(ruspini_scaled, centers = 4)
plot(ruspini_scaled, col=cluster_spec)

Internal Cluster Validation

Compare the Clustering Quality

Look at the within.cluster.ss and the avg.silwidth

#library(fpc)

Note: I do not load fpc since the NAMESPACE overwrites dbscan.

# compute a large set of internal validation indices for the
# k-means clustering from the distance matrix d
fpc::cluster.stats(d, km$cluster)
## $n
## [1] 75
## 
## $cluster.number
## [1] 4
## 
## $cluster.size
## [1] 23 20 17 15
## 
## $min.cluster.size
## [1] 15
## 
## $noisen
## [1] 0
## 
## $diameter
## [1] 1.1591436 1.1192822 1.4627043 0.8359025
## 
## $average.distance
## [1] 0.4286139 0.4824376 0.5805551 0.3564457
## 
## $median.distance
## [1] 0.3934100 0.4492432 0.5023855 0.3379729
## 
## $separation
## [1] 0.767612 1.157682 0.767612 1.157682
## 
## $average.toother
## [1] 2.148527 2.157193 2.293318 2.307969
## 
## $separation.matrix
##          [,1]     [,2]     [,3]     [,4]
## [1,] 0.000000 1.219930 0.767612 1.957726
## [2,] 1.219930 0.000000 1.339721 1.157682
## [3,] 0.767612 1.339721 0.000000 1.308435
## [4,] 1.957726 1.157682 1.308435 0.000000
## 
## $ave.between.matrix
##          [,1]     [,2]     [,3]     [,4]
## [1,] 0.000000 1.887363 1.924915 2.750174
## [2,] 1.887363 0.000000 2.771960 1.874198
## [3,] 1.924915 2.771960 0.000000 2.220011
## [4,] 2.750174 1.874198 2.220011 0.000000
## 
## $average.between
## [1] 2.219257
## 
## $average.within
## [1] 0.462697
## 
## $n.between
## [1] 2091
## 
## $n.within
## [1] 684
## 
## $max.diameter
## [1] 1.462704
## 
## $min.separation
## [1] 0.767612
## 
## $within.cluster.ss
## [1] 10.08781
## 
## $clus.avg.silwidths
##         1         2         3         4 
## 0.7454551 0.7211353 0.6812849 0.8073733 
## 
## $avg.silwidth
## [1] 0.7368082
## 
## $g2
## NULL
## 
## $g3
## NULL
## 
## $pearsongamma
## [1] 0.8415597
## 
## $dunn
## [1] 0.5247896
## 
## $dunn2
## [1] 3.228286
## 
## $entropy
## [1] 1.37327
## 
## $wb.ratio
## [1] 0.2084918
## 
## $ch
## [1] 323.5512
## 
## $cwidegap
## [1] 0.3152817 0.2611701 0.4149825 0.2351854
## 
## $widestgap
## [1] 0.4149825
## 
## $sindex
## [1] 0.8583457
## 
## $corrected.rand
## NULL
## 
## $vi
## NULL
# the same statistics can be computed for the hierarchical clusterings:
#cluster.stats(d, cluster_complete)
#cluster.stats(d, cluster_single)

Read ? cluster.stats for an explanation of all the available indices.

# Compare selected indices for all three clusterings. cluster.stats
# returns a list, so the sapply result is a matrix of list elements
# that we subset by row (measure) name.
sapply(list(
  km=km$cluster,
  hc_compl=cluster_complete,
  hc_single=cluster_single),
       FUN=function(x)
         fpc::cluster.stats(d, x))[c("within.cluster.ss","avg.silwidth"),]
##                   km        hc_compl  hc_single
## within.cluster.ss 10.08781  10.08781  10.08781 
## avg.silwidth      0.7368082 0.7368082 0.7368082

Silhouette plot

library(cluster)
# one bar per object, grouped by cluster; bar length = silhouette width
plot(silhouette(km$cluster, d))

Note: The silhouette plot does not show correctly in R Studio if you have too many objects (bars are missing). It will work when you open a new plotting device with windows(), x11() or quartz().

Find Optimal Number of Clusters for k-means

plot(ruspini_scaled)

# fix the RNG seed so the repeated k-means runs are reproducible
set.seed(1234)
ks <- 2:10

Within Sum of Squares

Use within sum of squares and look for the knee (nstart=5 repeats k-means 5 times and returns the best solution)

WSS <- sapply(ks, FUN=function(k) {
  kmeans(ruspini_scaled, centers=k, nstart=5)$tot.withinss
  })
plot(ks, WSS, type="l")
abline(v=4, col="red", lty=2)

Average Silhouette Width

Use average silhouette width (look for the max)

ASW <- sapply(ks, FUN=function(k) {
  fpc::cluster.stats(d, kmeans(ruspini_scaled, centers=k, nstart=5)$cluster)$avg.silwidth
  })
plot(ks, ASW, type="l")

# k with the largest average silhouette width
ks[which.max(ASW)]
## [1] 4
abline(v=ks[which.max(ASW)], col="red", lty=2)

Dunn Index

Use Dunn index (another internal measure given by min. separation/ max. diameter)

DI <- sapply(ks, FUN=function(k) {
  fpc::cluster.stats(d, kmeans(ruspini_scaled, centers=k, nstart=5)$cluster)$dunn
})
plot(ks, DI, type="l")
# k with the largest Dunn index
ks[which.max(DI)]
## [1] 4
abline(v=ks[which.max(DI)], col="red", lty=2)

Gap Statistic

Compares the change in within-cluster dispersion with that expected from a null model (see ? clusGap). The default method is to choose the smallest k such that its value Gap(k) is not more than 1 standard error away from the first local maximum.

library(cluster)
# nstart is passed on to kmeans via ...; B = 100 reference sets (default)
k <- clusGap(ruspini_scaled, FUN = kmeans,  nstart = 10, K.max = 10)
k
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = ruspini_scaled, FUNcluster = kmeans, K.max = 10,     nstart = 10)
## B=100 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
##  --> Number of clusters (method 'firstSEmax', SE.factor=1): 4
##           logW   E.logW         gap     SE.sim
##  [1,] 3.497911 3.474301 -0.02360980 0.03835335
##  [2,] 3.073937 3.157027  0.08309024 0.04005118
##  [3,] 2.678004 2.908822  0.23081785 0.03767224
##  [4,] 2.106416 2.710845  0.60442835 0.03725234
##  [5,] 1.987213 2.571981  0.58476735 0.03699570
##  [6,] 1.863936 2.455285  0.59134824 0.03851437
##  [7,] 1.732234 2.350146  0.61791223 0.03754122
##  [8,] 1.640492 2.257624  0.61713194 0.03782124
##  [9,] 1.579299 2.172410  0.59311149 0.03841206
## [10,] 1.524184 2.094176  0.56999210 0.03782182
plot(k)

Note: these methods can also be used for hierarchical clustering.

There have been many other methods and indices proposed to determine the number of clusters. See, e.g., package NbClust.

Visualize the Distance Matrix

Visualizing the unordered distance matrix does not show much structure.

plot(ruspini_scaled)

d <- dist(ruspini_scaled)

library(seriation)
# image of the (unordered) distance matrix
pimage(d)

Reorder using cluster labels

# rows/columns reordered so objects of the same cluster are adjacent
pimage(d, order=order(km$cluster))

Use dissplot which rearranges clusters, adds cluster labels, and shows average dissimilarity in the lower half of the plot.

dissplot(d, labels=km$cluster, options=list(main="k-means with k=4"))

dissplot(d, labels=db$cluster+1, options=list(main="DBSCAN"))

Spot the problem data points for DBSCAN (we use +1 so the noise is now cluster #1)

Misspecified k

dissplot(d, labels=kmeans(ruspini_scaled, centers=3)$cluster)

dissplot(d, labels=kmeans(ruspini_scaled, centers=9)$cluster)

External Cluster Validation

External cluster validation uses ground truth information. That is, the user has an idea how the data should be grouped. This could be a known class label not provided to the clustering algorithm.

We use an artificial data set with known groups here. First, we need to cluster the new data. We do k-means and hierarchical clustering.

library(mlbench)
# artificial 2D data with a known grouping (shapes$class)
shapes <- mlbench.smiley(n=500, sd1 = 0.1, sd2 = 0.05)
plot(shapes)

Prepare data

# keep the true group labels for external validation later
truth <- as.integer(shapes$class)
shapes <- scale(shapes$x)

plot(shapes)

Find optimal number of Clusters for k-means

ks <- 2:20

Use within sum of squares (look for the knee)

WSS <- sapply(ks, FUN=function(k) {
  kmeans(shapes, centers=k, nstart=10)$tot.withinss
})
plot(ks, WSS, type="l")

looks like 6 clusters

# note: this overwrites the km object from the ruspini analysis above
km <- kmeans(shapes, centers=6, nstart = 10)
plot(shapes, col=km$cluster)

Hierarchical clustering: single-link because of the mouth

d <- dist(shapes)
hc <- hclust(d, method="single")

Find optimal number of clusters

ASW <- sapply(ks, FUN=function(k) {
  fpc::cluster.stats(d, cutree(hc, k))$avg.silwidth
})
plot(ks, ASW, type="l")

4 clusters

hc_4 <- cutree(hc, 4)
plot(shapes, col=hc_4)

Spectral Clustering

library("kernlab")
spec <- specc(shapes, centers = 4)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 25000)
plot(shapes, col=spec)

Compare with ground truth with the corrected (=adjusted) Rand index (ARI), the variation of information (VI) index, entropy and purity.

#

define entropy and purity

# Entropy of a clustering given ground-truth labels (in bits).
#
# For each cluster, compute the entropy of the true-class distribution
# inside that cluster, then average over clusters weighted by cluster
# size:  sum_i (m_i / m) * ( -sum_j p_ij * log2(p_ij) ),
# where p_ij is the fraction of objects in cluster i belonging to true
# class j. 0 = every cluster is pure; larger values = more mixed.
#
# cluster, truth: equal-length vectors of integer labels in 1:k.
# Returns a single numeric value.
entropy <- function(cluster, truth) {
  k <- max(cluster, truth)
  cluster <- factor(cluster, levels = 1:k)
  truth <- factor(truth, levels = 1:k)
  m <- length(cluster)
  mi <- table(cluster)  # cluster sizes m_i

  # contingency counts: rows = true classes, columns = clusters
  # (sapply over split() puts one cluster per column)
  cnts <- split(truth, cluster)
  cnts <- sapply(cnts, FUN = function(n) table(n))
  # class proportions WITHIN each cluster: normalize by COLUMN
  # (cluster) totals. The original divided by rowSums (true-class
  # sizes) and then weighted rowSums of e by cluster sizes, mixing
  # up the two margins of the matrix.
  p <- sweep(cnts, 2, colSums(cnts), "/")
  p[is.nan(p)] <- 0  # empty cluster: 0/0 -> 0
  e <- -p * log(p, 2)  # 0 * log(0) yields NaN, dropped via na.rm below
  # per-cluster entropies are the column sums; weight by cluster size
  sum(colSums(e, na.rm = TRUE) * mi/m)
}

# Purity of a clustering given ground-truth labels.
#
# Each cluster is credited with its majority true class; purity is the
# size-weighted average of these majority fractions:
#   sum_i (m_i / m) * max_j p_ij,
# where p_ij is the fraction of objects in cluster i belonging to true
# class j. 1 = every cluster contains only a single class.
#
# cluster, truth: equal-length vectors of integer labels in 1:k.
# Returns a single numeric value in (0, 1].
purity <- function(cluster, truth) {
  k <- max(cluster, truth)
  cluster <- factor(cluster, levels = 1:k)
  truth <- factor(truth, levels = 1:k)
  m <- length(cluster)
  mi <- table(cluster)  # cluster sizes m_i

  # contingency counts: rows = true classes, columns = clusters
  # (sapply over split() puts one cluster per column)
  cnts <- split(truth, cluster)
  cnts <- sapply(cnts, FUN = function(n) table(n))
  # normalize by COLUMN (cluster) totals so each column holds the class
  # distribution of one cluster. The original normalized by rowSums and
  # took row-wise maxima, mixing up the two margins of the matrix.
  p <- sweep(cnts, 2, colSums(cnts), "/")
  p[is.nan(p)] <- 0  # empty cluster: contributes 0

  # majority-class fraction per cluster, weighted by cluster size
  sum(apply(p, 2, max) * mi/m)
}

calculate measures (for comparison we also use random “clusterings” with 4 and 6 clusters)

# random assignments with 4 and 6 clusters serve as baselines
random4 <- sample(1:4, nrow(shapes), replace = TRUE)
random6 <- sample(1:6, nrow(shapes), replace = TRUE)

# one row per clustering: ARI and VI from fpc::cluster.stats
# (compareonly = TRUE) plus our own entropy and purity measures
r <- rbind(
  kmeans = c(
    unlist(fpc::cluster.stats(d, km$cluster, truth, compareonly = TRUE)),
    entropy = entropy(km$cluster, truth),
    purity = purity(km$cluster, truth)
    ),
  hc = c(
    unlist(fpc::cluster.stats(d, hc_4, truth, compareonly = TRUE)),
    entropy = entropy(hc_4, truth),
    purity = purity(hc_4, truth)
    ),
  spec = c(
    unlist(fpc::cluster.stats(d, spec, truth, compareonly = TRUE)),
    entropy = entropy(spec, truth),
    purity = purity(spec, truth)
    ),
  random4 = c(
    unlist(fpc::cluster.stats(d, random4, truth, compareonly = TRUE)),
    entropy = entropy(random4, truth),
    purity = purity(random4, truth)
    ),
  random6 = c(
    unlist(fpc::cluster.stats(d, random6, truth, compareonly = TRUE)),
    entropy = entropy(random6, truth),
    purity = purity(random6, truth)
    )
  )
r
##         corrected.rand        vi   entropy    purity
## kmeans     0.695694371 0.4382154 0.2631692 0.4844306
## hc         1.000000000 0.0000000 0.0000000 1.0000000
## spec       0.548508174 0.5566220 0.6673703 0.6780723
## random4    0.003296024 2.6665655 1.9788525 0.2923809
## random6   -0.002615134 3.0773033 1.6964185 0.1432732

Hierarchical clustering found the perfect clustering.

Read ? cluster.stats for an explanation of all the available indices.